#########################################################################
### Unexpected Responses on Amazon Mechanical Turk                    ###
### Survey with Indian Respondents Only                               ###
#########################################################################

library(rgeolocate)
library(ggmap)
library(revgeo)
library(ngram)
library(MASS)
library(pscl)
library(stargazer)
library(xtable)
library(Rfast)
library(BMA)
source('imageplotbma.R')

survey_full<-read.csv("IndiaMTurkSurvey2019.csv", header=T, stringsAsFactors = F)

survey<-survey_full[survey_full$Progress>97,]


#Mark Invalid Responses
survey$HoursWorkedOutside<-survey$HoursWorked-survey$HoursWorkedMTurk
survey$HoursWorkedInvalid<-ifelse(survey$HoursWorkedOutside<0,1,NA)

#Must select 3 on the Attention Check to pass
survey$AttentionCheckInvalid<-ifelse(survey$AttentionCheck!=3,1,NA)
survey$FindOutCoded<-ifelse(survey$FindOutCoded=="Invalid",1,NA)
survey$StartHelpInvalid<-ifelse(survey$StartHelpCoded=="Invalid",1,NA)

#Requirement was to complete at least 1000 HITs
survey$HITsCompletedInvalid<-ifelse(survey$HITsCompleted<1000,1,NA)
survey$StateInvalid<-ifelse(survey$StateCoded=="United States" |
                              survey$StateCoded=="India" | survey$StateCoded=="Invalid",
                            1,NA)

survey$InvalidCount<-rowSums(survey[,c("HoursWorkedInvalid",
                                       "HITsCompletedInvalid",
                                       "AttentionCheckInvalid", "TimingInvalid",
                                       "StateInvalid")], na.rm=TRUE)
survey$Invalid<-ifelse(survey$InvalidCount!=0,1,0)


#Geolocate respondent's state

#address<-mapply(FUN=function(Longitude, Latitude) 
#  revgeo(Longitude,Latitude,output='hash')[4],
#  survey$Longitude, survey$Latitude)

#address2 <- data.frame(matrix(unlist(address), nrow=length(address), byrow=T),
#                       stringsAsFactors = F)
#write.csv(address2,'address2.csv',row.names=F)

address2<-read.csv("address2.csv",header=T,stringsAsFactors = F)
colnames(address2)<-c("IPState")
address2$IPState<-ifelse(address2$IPState=="State Not Found",NA,address2$IPState)
survey$IPState<-address2$IPState

#Code Tamil Nadu
survey$IPStateTN<-ifelse(survey$IPState=="Tamil Nadu",1,0)

#Code number of duplicated or seemingly copied responses
survey$FlagCount<-rowSums(survey[,c("StartHelpFlag", "TimingFlag")], na.rm=TRUE)
survey$Flag<-ifelse(survey$FlagCount!=0,1,0)

#Timing flag (for a duplicate response on the describe your typical day question)
surveyflag<-survey[survey$TimingFlag==1,]
surveyflag<-surveyflag[rowSums(is.na(surveyflag)) != ncol(surveyflag), ]
responses<-surveyflag[c("IP.Address", "TimingText")]



### Descriptive Statistics
survey$FindOutFriends<-ifelse(survey$FindOut==1,1,0)
survey$FindOutWeb<-ifelse(survey$FindOut==7,1,0)
survey$StartHelpFriend<-ifelse(survey$StartHelpCoded=="Friend",1,0)
survey$StartHelpFriend<-ifelse(is.na(survey$StartHelpFriend),0,survey$StartHelpFriend)
survey$HelpFriend<-ifelse(is.na(survey$HelpFriend),0,1)
survey$HelpFamily<-ifelse(is.na(survey$HelpFamily),0,1)
survey$DiscoverThisHITFriend<-ifelse(survey$DiscoverThisHIT==4,1,0)

#Valid respondents write more words
survey$TimingWords<-as.numeric(as.character(lapply(FUN=function(i) wordcount(i), survey$TimingText)))

#Recode age reported in birth year
survey$Age<-ifelse(survey$Age==1992, 27, survey$Age)
survey$Age<-ifelse(survey$Age==1991, 28, survey$Age)

#Code caste as a factor
survey$Caste<-as.factor(survey$Caste)
survey$CasteOBC<-ifelse(survey$Caste=="5",1,0)

survey$NativeLanguageEnglish<-ifelse(survey$NativeLanguageCoded=="English"|
                                       survey$NativeLanguageCoded=="Tamil and English",1,0)



###Predicting Invalid responses
survey$DiscoverWhatsApp<-ifelse(is.na(survey$DiscoverWhatsApp),0,survey$DiscoverWhatsApp)
survey$DiscoverFriend<-ifelse(is.na(survey$DiscoverFriend),0,survey$DiscoverFriend)
survey$GroupWhatsApp<-ifelse(is.na(survey$GroupWhatsApp),0,survey$GroupWhatsApp)
survey$GroupFacebook<-ifelse(is.na(survey$GroupFacebook),0,survey$GroupFacebook)
survey$GroupNone<-ifelse(is.na(survey$GroupNone),0,survey$GroupNone)
survey$LocationHome<-ifelse(survey$Location==1,1,0)
survey$StartMotivationMoney<-ifelse(survey$StartMotivation==4,1,0)


#Timing Model
model_x<-survey[c("Female","MasterWorker",
                  "Age","Married","Months",
                  "GroupWhatsApp", "GroupFacebook", "ShareFriend", "ShareFamily",
                  "ShareFrequency", "AcademicSurveys", "LocationHome", "LocationOtherWorkers",
                  "LocationOthersHousehold", "StartMotivationMoney","TalkFrequency", "MTurkPrimaryJob",
                  "FindOutFriends", "FindOutWeb", "KnowOnMTurk", "TalkMTurk",
                  "HelpJoinNumber", "HelpFamily", "HelpFriend", "WorkerFriendships",
                  "DiscoverFriend", "DiscoverWhatsApp", "CasteOBC", "IncomeLadder", "NativeLanguageEnglish",
                  "StartHelp")]


#Table SI.2.1: Descriptives for everyone
data_summary<-as.data.frame(matrix(nrow=31, ncol=6))
for(i in 1:31){
  data_summary[i,1]<-colnames(model_x[i])
  data_summary[i,2]<-min(model_x[,i], na.rm=T)
  data_summary[i,3]<-max(model_x[,i], na.rm=T)
  data_summary[i,4]<-sqrt(var(model_x[,i], na.rm=T))
  data_summary[i,5]<-mean(model_x[,i], na.rm=T)
  data_summary[i,6]<-median(model_x[,i], na.rm=T)
}
print(xtable(data_summary, type='latex', row.names=F), file='summary.tex', include.rownames=FALSE)


#Table SI.2.2: Descriptives for everyone
surveyvalid<-survey[survey$Invalid==0,]
valid_x<-surveyvalid[c("Female","MasterWorker",
                  "Age","Married","Months",
                  "GroupWhatsApp", "GroupFacebook", "ShareFriend", "ShareFamily",
                  "ShareFrequency", "AcademicSurveys", "LocationHome", "LocationOtherWorkers",
                  "LocationOthersHousehold", "StartMotivationMoney","TalkFrequency", "MTurkPrimaryJob",
                  "FindOutFriends", "FindOutWeb", "KnowOnMTurk", "TalkMTurk",
                  "HelpJoinNumber", "HelpFamily", "HelpFriend", "WorkerFriendships",
                  "DiscoverFriend", "DiscoverWhatsApp", "CasteOBC", "IncomeLadder", "NativeLanguageEnglish",
                  "StartHelp")]

data_summary<-as.data.frame(matrix(nrow=31, ncol=6))
for(i in 1:31){
  data_summary[i,1]<-colnames(valid_x[i])
  data_summary[i,2]<-min(valid_x[,i], na.rm=T)
  data_summary[i,3]<-max(valid_x[,i], na.rm=T)
  data_summary[i,4]<-sqrt(var(valid_x[,i], na.rm=T))
  data_summary[i,5]<-mean(valid_x[,i], na.rm=T)
  data_summary[i,6]<-median(valid_x[,i], na.rm=T)
}
print(xtable(data_summary, type='latex', row.names=F), file='summary.tex', include.rownames=FALSE)



#Timing model
timing_y<-survey$TimingInvalid

#timingmodel<- bic.glm(model_x, timing_y, strict = FALSE, OR = 20,
#                      glm.family="binomial", factor.type=TRUE)
#save(timingmodel, file='timingmodel.RData')
load("timingmodel.RData")
summary(timingmodel)
imageplot.bma(timingmodel)

timingmodel2<-glm(TimingInvalid~Age+LocationOtherWorkers+
                    LocationOthersHousehold+MTurkPrimaryJob+KnowOnMTurk+
                    WorkerFriendships, survey, family="binomial")
timingmodel3<-glm(TimingInvalid~AcademicSurveys+Age+LocationOtherWorkers+
                    LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                    LocationHome+StartMotivationMoney+FindOutFriends+
                    KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                    MasterWorker+Months+DiscoverFriend+HelpFamily+
                    StartHelp+ShareFrequency, survey, family="binomial")
summary(timingmodel2)
exp(coef(timingmodel2))


#HoursWorkedInvalid
hours_y<-survey$HoursWorkedInvalid

#hoursmodel<- bic.glm(model_x, hours_y, strict = FALSE, OR = 20,
#                      glm.family="binomial", factor.type=TRUE)
#save(hoursmodel, file='hoursmodel.RData')
load("hoursmodel.RData")

summary(hoursmodel)


hoursmodel2<-glm(HoursWorkedInvalid~MasterWorker+Months+DiscoverFriend+StartHelp
                 , survey, family="binomial")
hoursmodel3<-glm(HoursWorkedInvalid~AcademicSurveys+Age+LocationOtherWorkers+
                   LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                   LocationHome+StartMotivationMoney+FindOutFriends+
                   KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                   MasterWorker+Months+DiscoverFriend+HelpFamily+
                   StartHelp+ShareFrequency
                 , survey, family="binomial")
summary(hoursmodel2)
exp(coef(hoursmodel2))



#HITsCompletedInvalid Model
HITs_y<-survey$HITsCompletedInvalid

#HITsmodel<- bic.glm(model_x, HITs_y, strict = FALSE, OR = 20,
#                      glm.family="binomial", factor.type=TRUE)
#save(HITsmodel, file='HITsmodel.RData')
load("HITsmodel.RData")

summary(HITsmodel)
imageplot.bma(HITsmodel)

HITsmodel2<-glm(HITsCompletedInvalid~Age+ShareFriend+LocationHome+StartMotivationMoney+
                  FindOutFriends+KnowOnMTurk+HelpFamily+StartHelp
                   , survey, family="binomial")
HITsmodel3<-glm(HITsCompletedInvalid~AcademicSurveys+Age+LocationOtherWorkers+
                  LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                  LocationHome+StartMotivationMoney+FindOutFriends+
                  KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                  MasterWorker+Months+DiscoverFriend+HelpFamily+
                  StartHelp+ShareFrequency
                , survey, family="binomial")
summary(HITsmodel2)
exp(coef(HITsmodel2))


#AttentionCheckInvalid Model
attention_y<-survey$AttentionCheckInvalid

#attentionmodel<- bic.glm(model_x, attention_y, strict = FALSE, OR = 20,
#                      glm.family="binomial", factor.type=TRUE)
#save(attentionmodel, file='attentionmodel.RData')

load("attentionmodel.RData")
summary(attentionmodel)
imageplot.bma(attentionmodel)

attentionmodel2<-glm(AttentionCheckInvalid~Months+ShareFrequency+AcademicSurveys+
                       LocationOthersHousehold+WorkerFriendships, survey, family="binomial")
attentionmodel3<-glm(AttentionCheckInvalid~AcademicSurveys+Age+LocationOtherWorkers+
                       LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                       LocationHome+StartMotivationMoney+FindOutFriends+
                       KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                       MasterWorker+Months+DiscoverFriend+HelpFamily+
                       StartHelp+ShareFrequency, survey, family="binomial")
summary(attentionmodel2)
exp(coef(attentionmodel2))


#StateInvalid Model
state_y<-survey$StateInvalid

#statemodel<- bic.glm(model_x, state_y, strict = FALSE, OR = 20,
#                      glm.family="binomial", factor.type=TRUE)
#save(statemodel, file="statemodel.RData")

load("statemodel.RData")
summary(statemodel)
imageplot.bma(statemodel)

statemodel2<-glm(StateInvalid~Months+LocationOthersHousehold
                     , survey, family="binomial")
statemodel3<-glm(StateInvalid~AcademicSurveys+Age+LocationOtherWorkers+
                   LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                   LocationHome+StartMotivationMoney+FindOutFriends+
                   KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                   MasterWorker+Months+DiscoverFriend+HelpFamily+
                   StartHelp+ShareFrequency
                 , survey, family="binomial")
summary(statemodel2)
exp(coef(statemodel2))

#Overall Invalid
invalid_y<-survey$Invalid

#invalidmodel<- bic.glm(model_x, invalid_y, strict = FALSE, OR = 20,
#                     glm.family="binomial", factor.type=TRUE)
#save(invalidmodel, file="invalidmodel.RData")

load("invalidmodel.RData")
summary(invalidmodel)
par(mar=c(5.1, 4.1, 10.1, 2.1))
imageplot.bma(invalidmodel, order="probne0")


invalidmodel2<-glm(Invalid~AcademicSurveys+LocationOtherWorkers+MTurkPrimaryJob+
                     WorkerFriendships+NativeLanguageEnglish
                 , survey, family="binomial")
invalidmodel3<-glm(Invalid~AcademicSurveys+Age+LocationOtherWorkers+
                     LocationOthersHousehold+MTurkPrimaryJob+ShareFriend+
                     LocationHome+StartMotivationMoney+FindOutFriends+
                     KnowOnMTurk+WorkerFriendships+NativeLanguageEnglish+
                     MasterWorker+Months+DiscoverFriend+HelpFamily+
                     StartHelp+ShareFrequency
                   , survey, family="binomial")
summary(invalidmodel2)
exp(coef(invalidmodel2))


#BMA results tables

#Table SI3.3
invalidmodel4<-cbind(invalidmodel$probne0, invalidmodel$postmean[-1], invalidmodel$postsd[-1])
invalidmodel4 <- invalidmodel4[order(-invalidmodel4[,1]),] 
colnames(invalidmodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(invalidmodel4, type='latex', row.names=F), file='invalid.tex', include.rownames=TRUE)

#Table SI3.4
attentionmodel4<-cbind(attentionmodel$probne0, attentionmodel$postmean[-1], attentionmodel$postsd[-1])
attentionmodel4 <- attentionmodel4[order(-attentionmodel4[,1]),] 
colnames(attentionmodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(attentionmodel4, type='latex', row.names=F), file='attention.tex', include.rownames=TRUE)

#Table SI.3.5
hoursmodel4<-cbind(hoursmodel$probne0, hoursmodel$postmean[-1], hoursmodel$postsd[-1])
hoursmodel4 <- hoursmodel4[order(-hoursmodel4[,1]),] 
colnames(hoursmodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(hoursmodel4, type='latex', row.names=F), file='hours.tex', include.rownames=TRUE)

#Table SI.3.6
HITsmodel4<-cbind(HITsmodel$probne0, HITsmodel$postmean[-1], HITsmodel$postsd[-1])
HITsmodel4 <- HITsmodel4[order(-HITsmodel4[,1]),] 
colnames(HITsmodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(HITsmodel4, type='latex', row.names=F), file='hits.tex', include.rownames=TRUE)

#Table SI.3.7
timingmodel4<-cbind(timingmodel$probne0, timingmodel$postmean[-1], timingmodel$postsd[-1])
timingmodel4 <- timingmodel4[order(-timingmodel4[,1]),] 
colnames(timingmodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(timingmodel4, type='latex', row.names=F), file='timing.tex', include.rownames=TRUE)

#Table SI.3.8
statemodel4<-cbind(statemodel$probne0, statemodel$postmean[-1], statemodel$postsd[-1])
statemodel4 <- statemodel4[order(-statemodel4[,1]),] 
colnames(statemodel4)<-c("PIP", "Post Mean", "Post SD")
print(xtable(statemodel4, type='latex', row.names=F), file='state.tex', include.rownames=TRUE)


#Figure SI.3.2: Plots

#Attention
#Keep: WorkerFriendships, ShareFrequency, LocationOthersHousehold, StartHelp
plot(attentionmodel, include=1:29, ylim=c(0,1), mfrow=c(1,1), cex.axis=2.5, cex.main=3.5)

#Hours
#Keep: DiscoverFriend, StartHelp
plot(hoursmodel, include=1:29, ylim=c(0,1), mfrow=c(1,1), cex.axis=2.5, cex.main=3.5)

#HITs
#Keep: LocationHome, StartMotivationMoney, ShareFriend, StartHelp, HelpFamily, FindOutFriends
plot(HITsmodel, include=1:29, ylim=c(0,1), mfrow=c(1,1), cex.axis=2.5, cex.main=3.5)

#Timing
#Keep: KnowOnMTurk, LocationOtherWorkers, WorkerFriendships, LocationOthersHousehold
plot(timingmodel, include=1:29, ylim=c(0,1), mfrow=c(1,1), cex.axis=2.5, cex.main=3.5)

#State
#Keep: LocationOthersHousehold
plot(statemodel, include=1:29, ylim=c(0,1), mfrow=c(1,1), cex.axis=2.5, cex.main=3.5)




#Logit model results tables

#Table SI.3.9
stargazer(invalidmodel2, attentionmodel2, hoursmodel2, HITsmodel2, timingmodel2, statemodel2)

#Table SI.3.10
stargazer(invalidmodel3, attentionmodel3, hoursmodel3, HITsmodel3, timingmodel3, statemodel3)

#Figure SI.3.1: Image plot
par(mar=c(5.1, 4.1, 4.1, 2.1), mfrow=c(3,2))
imageplot.bma2(invalidmodel, main="Unexpected Response", order="probne0", cex.axis=0.8)
imageplot.bma2(attentionmodel, main="Attention Check", order="probne0", cex.axis=0.8)
imageplot.bma2(hoursmodel, main="Hours Worked", order="probne0", cex.axis=0.8)
imageplot.bma2(HITsmodel, main="HITsCompleted", order="probne0", cex.axis=0.8)
imageplot.bma2(timingmodel, main="Description", order="probne0", cex.axis=0.8)
imageplot.bma2(statemodel, main="State", order="probne0", cex.axis=0.8)
dev.off()



#Inattentive Response Indicators

#Satisficing
survey$satisficing<-survey$TalkFrequency+survey$WorkerFriendships+
  survey$ShareFrequency
table(survey$satisficing, survey$MasterWorker)
t.test(survey[survey$MasterWorker==0,]$satisficing, survey[survey$MasterWorker==1,]$satisficing)

table(survey$satisficing, survey$Invalid)
t.test(survey[survey$Invalid==0,]$satisficing, survey[survey$Invalid==1,]$satisficing)


#Timing
t.test(survey[survey$MasterWorker==0,]$Duration, survey[survey$MasterWorker==1,]$Duration)
t.test(survey[survey$Invalid==0,]$Duration, survey[survey$Invalid==1,]$Duration)

t.test(survey[survey$MasterWorker==0,]$TimingWords, survey[survey$MasterWorker==1,]$TimingWords)
t.test(survey[survey$Invalid==0,]$TimingWords, survey[survey$Invalid==1,]$TimingWords)

